In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

Question 5¶

In [121]:
from itertools import combinations
import random
def square_distance(x,y): return sum([(xi-yi)**2 for xi, yi in zip(x,y)])
In [122]:
#Defining a function to calculate Maximum Distance and Minimum Distance between any 2 points in a vector
def max_min(dimension):
    samples = [[random.uniform(-5, 5) for dim in range(dimension)] for point in range(500)]
    distances=[]
    for pair in combinations(samples,2):
        distances.append(np.sqrt(square_distance(*pair)))

    return np.log10((max(distances)-min(distances))/min(distances)), max(distances), min(distances)
In [132]:
for i in range(2,50):
    print(max_min(i)[0])
3.0827042213579423
2.234452730861751
1.7103018935845447
1.4080086363939766
1.215888827976844
1.0510437907827654
1.0423369400305575
0.9345490814522746
0.8457517512396465
0.9494415019362359
0.6519767867852018
0.692642124877255
0.63602079082911
0.5659303015181862
0.4821850902835167
0.44985791925812224
0.4225677552286724
0.4083421044710428
0.45193505900402414
0.4177823561420615
0.47456482937817257
0.3019186492950519
0.3273742665057678
0.3432260346577939
0.34394747309117607
0.31752845505850663
0.28098575159621014
0.2861124598081354
0.2555417061249269
0.27717786017957735
0.2341418785371315
0.25756621695483717
0.18291959773657138
0.18399795109616374
0.1599502684133582
0.14844392378723814
0.19214047516827798
0.1397053637160367
0.12666856629750634
0.13414273391142661
0.0866104195697205
0.12616337672837857
0.07299217441470486
0.14992303814984598
0.06576140660642892
0.08203111555438576
0.0702780239982885
0.059939290263023866
In [142]:
x=list(range(2,50))
y=[]
for i in x:
    y.append(max_min(i)[0])
plt.plot(x, y)
plt.title("Curse of dimensionality")
plt.xlabel("number of dimensions")
plt.ylabel("log10((max_dist-min_dist)/min_dist)")
plt.show()

Question 4¶

In [2]:
data= pd.read_csv('T2D_abundance.csv', delimiter= '\t')
data.head()
Out[2]:
Unnamed: 0 k__Archaea|p__Euryarchaeota|c__Methanobacteria|o__Methanobacteriales|f__Methanobacteriaceae|g__Methanobrevibacter|s__Methanobrevibacter_smithii k__Archaea|p__Euryarchaeota|c__Methanobacteria|o__Methanobacteriales|f__Methanobacteriaceae|g__Methanobrevibacter|s__Methanobrevibacter_unclassified k__Archaea|p__Euryarchaeota|c__Methanobacteria|o__Methanobacteriales|f__Methanobacteriaceae|g__Methanosphaera|s__Methanosphaera_stadtmanae k__Bacteria|p__Acidobacteria|c__Acidobacteriia|o__Acidobacteriales|f__Acidobacteriaceae|g__Acidobacteriaceae_unclassified k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales|f__Actinomycetaceae|g__Actinomyces|s__Actinomyces_graevenitzii k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales|f__Actinomycetaceae|g__Actinomyces|s__Actinomyces_odontolyticus k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales|f__Actinomycetaceae|g__Actinomyces|s__Actinomyces_turicensis k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales|f__Actinomycetaceae|g__Varibaculum|s__Varibaculum_cambriense k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales|f__Micrococcaceae|g__Rothia|s__Rothia_mucilaginosa ... k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Oceanospirillales|f__Halomonadaceae|g__Halomonas|s__Halomonas_boliviensis k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Vibrionales|f__Vibrionaceae|g__Vibrio|s__Vibrio_kanaloae k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Xanthomonadales|f__Xanthomonadaceae|g__Xanthomonas|s__Xanthomonas_axonopodis k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Xanthomonadales|f__Xanthomonadaceae|g__Xanthomonas|s__Xanthomonas_fuscans k__Bacteria|p__Tenericutes|c__Mollicutes|o__Mycoplasmatales|f__Mycoplasmataceae|g__Mycoplasma|s__Mycoplasma_bovis k__Bacteria|p__Bacteroidetes|c__Flavobacteriia|o__Flavobacteriales|f__Flavobacteriaceae|g__Zunongwangia|s__Zunongwangia_profunda k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactobacillales|f__Enterococcaceae|g__Enterococcus|s__Enterococcus_pallens k__Bacteria|p__Planctomycetes|c__Planctomycetia|o__Planctomycetales|f__Planctomycetaceae|g__Rhodopirellula|s__Rhodopirellula_unclassified k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Vibrionales|f__Vibrionaceae|g__Vibrio|s__Vibrio_furnissii Class
0 con-001 0.33364 0.00000 0.0 0.0 0.0 0.0 0.0 0.0 0.00000 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 n
1 con-002 0.49776 0.12802 0.0 0.0 0.0 0.0 0.0 0.0 0.00000 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 n
2 con-003 0.00000 0.00000 0.0 0.0 0.0 0.0 0.0 0.0 0.01254 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 n
3 con-004 0.00000 0.00000 0.0 0.0 0.0 0.0 0.0 0.0 0.02847 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 n
4 con-005 0.49446 0.06786 0.0 0.0 0.0 0.0 0.0 0.0 0.02221 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 n

5 rows × 574 columns

In [3]:
data.shape
Out[3]:
(344, 574)
In [4]:
data['Class'].value_counts()
Out[4]:
n      174
t2d    170
Name: Class, dtype: int64
In [5]:
data.iloc[0,:]
Out[5]:
Unnamed: 0                                                                                                                                              con-001
k__Archaea|p__Euryarchaeota|c__Methanobacteria|o__Methanobacteriales|f__Methanobacteriaceae|g__Methanobrevibacter|s__Methanobrevibacter_smithii         0.33364
k__Archaea|p__Euryarchaeota|c__Methanobacteria|o__Methanobacteriales|f__Methanobacteriaceae|g__Methanobrevibacter|s__Methanobrevibacter_unclassified        0.0
k__Archaea|p__Euryarchaeota|c__Methanobacteria|o__Methanobacteriales|f__Methanobacteriaceae|g__Methanosphaera|s__Methanosphaera_stadtmanae                  0.0
k__Bacteria|p__Acidobacteria|c__Acidobacteriia|o__Acidobacteriales|f__Acidobacteriaceae|g__Acidobacteriaceae_unclassified                                   0.0
                                                                                                                                                         ...   
k__Bacteria|p__Bacteroidetes|c__Flavobacteriia|o__Flavobacteriales|f__Flavobacteriaceae|g__Zunongwangia|s__Zunongwangia_profunda                            0.0
k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactobacillales|f__Enterococcaceae|g__Enterococcus|s__Enterococcus_pallens                                          0.0
k__Bacteria|p__Planctomycetes|c__Planctomycetia|o__Planctomycetales|f__Planctomycetaceae|g__Rhodopirellula|s__Rhodopirellula_unclassified                   0.0
k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Vibrionales|f__Vibrionaceae|g__Vibrio|s__Vibrio_furnissii                                           0.0
Class                                                                                                                                                         n
Name: 0, Length: 574, dtype: object
In [6]:
data= data.set_index('Unnamed: 0')
data.head()
Out[6]:
k__Archaea|p__Euryarchaeota|c__Methanobacteria|o__Methanobacteriales|f__Methanobacteriaceae|g__Methanobrevibacter|s__Methanobrevibacter_smithii k__Archaea|p__Euryarchaeota|c__Methanobacteria|o__Methanobacteriales|f__Methanobacteriaceae|g__Methanobrevibacter|s__Methanobrevibacter_unclassified k__Archaea|p__Euryarchaeota|c__Methanobacteria|o__Methanobacteriales|f__Methanobacteriaceae|g__Methanosphaera|s__Methanosphaera_stadtmanae k__Bacteria|p__Acidobacteria|c__Acidobacteriia|o__Acidobacteriales|f__Acidobacteriaceae|g__Acidobacteriaceae_unclassified k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales|f__Actinomycetaceae|g__Actinomyces|s__Actinomyces_graevenitzii k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales|f__Actinomycetaceae|g__Actinomyces|s__Actinomyces_odontolyticus k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales|f__Actinomycetaceae|g__Actinomyces|s__Actinomyces_turicensis k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales|f__Actinomycetaceae|g__Varibaculum|s__Varibaculum_cambriense k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales|f__Micrococcaceae|g__Rothia|s__Rothia_mucilaginosa k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales|f__Micrococcaceae|g__Rothia|s__Rothia_unclassified ... k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Oceanospirillales|f__Halomonadaceae|g__Halomonas|s__Halomonas_boliviensis k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Vibrionales|f__Vibrionaceae|g__Vibrio|s__Vibrio_kanaloae k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Xanthomonadales|f__Xanthomonadaceae|g__Xanthomonas|s__Xanthomonas_axonopodis k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Xanthomonadales|f__Xanthomonadaceae|g__Xanthomonas|s__Xanthomonas_fuscans k__Bacteria|p__Tenericutes|c__Mollicutes|o__Mycoplasmatales|f__Mycoplasmataceae|g__Mycoplasma|s__Mycoplasma_bovis k__Bacteria|p__Bacteroidetes|c__Flavobacteriia|o__Flavobacteriales|f__Flavobacteriaceae|g__Zunongwangia|s__Zunongwangia_profunda k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactobacillales|f__Enterococcaceae|g__Enterococcus|s__Enterococcus_pallens k__Bacteria|p__Planctomycetes|c__Planctomycetia|o__Planctomycetales|f__Planctomycetaceae|g__Rhodopirellula|s__Rhodopirellula_unclassified k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Vibrionales|f__Vibrionaceae|g__Vibrio|s__Vibrio_furnissii Class
Unnamed: 0
con-001 0.33364 0.00000 0.0 0.0 0.0 0.0 0.0 0.0 0.00000 0.00000 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 n
con-002 0.49776 0.12802 0.0 0.0 0.0 0.0 0.0 0.0 0.00000 0.00000 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 n
con-003 0.00000 0.00000 0.0 0.0 0.0 0.0 0.0 0.0 0.01254 0.00262 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 n
con-004 0.00000 0.00000 0.0 0.0 0.0 0.0 0.0 0.0 0.02847 0.00000 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 n
con-005 0.49446 0.06786 0.0 0.0 0.0 0.0 0.0 0.0 0.02221 0.00000 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 n

5 rows × 573 columns

In [7]:
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
%matplotlib inline
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns
In [8]:
data_copy= data.drop(columns=['Class'])
pca = PCA(n_components=2)
components = pca.fit_transform(data_copy)
components.shape
Out[8]:
(344, 2)
In [9]:
#PCA METHOD
import plotly.express as px

data_copy= data.drop(columns=['Class'])
pca = PCA(n_components=2)
components = pca.fit_transform(data_copy)
fig = px.scatter(components, x=0, y=1, color=data['Class'])
fig.show()
In [10]:
#t-SNE METHOD
import time
time_start = time.time()
tsne = TSNE(n_components=2, verbose=1, perplexity=40, n_iter=300)
tsne_results = tsne.fit_transform(data_copy)
fig = px.scatter(tsne_results, x=0, y=1, color=data['Class'])
fig.show()
C:\Users\user\anaconda3\lib\site-packages\sklearn\manifold\_t_sne.py:795: FutureWarning:

The default initialization in TSNE will change from 'random' to 'pca' in 1.2.

C:\Users\user\anaconda3\lib\site-packages\sklearn\manifold\_t_sne.py:805: FutureWarning:

The default learning rate in TSNE will change from 200.0 to 'auto' in 1.2.

[t-SNE] Computing 121 nearest neighbors...
[t-SNE] Indexed 344 samples in 0.002s...
[t-SNE] Computed neighbors for 344 samples in 0.173s...
[t-SNE] Computed conditional probabilities for sample 344 / 344
[t-SNE] Mean sigma: 13.097886
[t-SNE] KL divergence after 250 iterations with early exaggeration: 68.297318
[t-SNE] KL divergence after 300 iterations: 1.091797
In [ ]: